output_results = "../results/"
output_data = "../results/"
ps_deseq <- readRDS(paste0(output_results, "Filtered/ps_deseq_friedfilt.rds"))
ps_css <- readRDS(paste0(output_results,"Filtered/ps_css_friedfilt.rds"))
ps_no_norm <- readRDS(paste0(output_results,"Filtered/ps_no_norm_friedfilt.rds"))
ps_not_norm_comp <- readRDS(paste0("../data/ps_no_norm_age_bf_filt_complete_family.rds"))
#remove samples that are too young and their paired sibling
tooyoung <-ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Age..months. < 24)]
#family 65 should be removed according to >24 months criteria
ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != unique(tooyoung), ps_not_norm_comp)
#List of individuals that were reported w/ autism, but was not classified as such through MARA and/or video classifier
pheno_contrad <-read.csv("../phenotype_contradictions.csv")
contradicting<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% as.character(pheno_contrad$host_name))])
#remove the whole family
for (i in contradicting){
ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)
}
#Remove Breastfed and their families
breast_fed <- c("089_A","054_N", "158_N" )
b_fed<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% breast_fed)])
#remove the whole family
for (i in b_fed){
ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)
}
Compare resulting amplicon data between and within sample types by canonical correlation analysis, regression profiling, and visualization (e.g. non-metric multi-dimensional scaling [NMDS], principle coordinates of analysis, principle component analysis).
plotting_consPcoA <- function(ps, norm, variables){
#drop samples with NA values in contraints
keep <- as.vector(apply(ps@sam_data[, variables], 1, function(x) return(!any(is.na(x)))))
ps <- prune_samples(keep, ps)
#Ease of labeling
colnames(ps@sam_data)[colnames(ps@sam_data) == variables] <- gsub("frequency...longitudinal.", "", variables)
variables <- gsub("frequency...longitudinal.", "", variables)
colnames(ps@sam_data)[colnames(ps@sam_data) == variables] <- gsub("frequency.", "", variables)
variables <- gsub("frequency.", "", variables)
#include only families that have all 6 timepoints, 3 for ASD and 3 for NT participants, for ease of visualization
fam_6<-names(table(sample_data(ps)$Family.group.ID)[table(sample_data(ps)$Family.group.ID) == 6])
ps_6fam<-prune_samples(sample_data(ps)$Family.group.ID %in% fam_6,ps )
sample_data(ps_6fam)$Family.group.ID <- paste0('fam', as.character(sample_data(ps_6fam)$Family.group.ID))
formula <- as.formula(paste("~", variables))
if(length(variables) > 1){
formula <- as.formula(paste("~", paste(variables, collapse = "+")))
}
ps_pcoa_ord <- ordinate(
physeq = ps_6fam,
method = "CAP",
distance = "bray",
formula = formula
)
p <- plot_ordination(
physeq = ps_6fam,
ordination = ps_pcoa_ord,
color = variables,
axes = c(1,2),
title= paste("Constrained PcoA",norm,"ordinated by \n", paste(variables, collapse = "\n"))) +
geom_point( size = 2) +
theme_minimal()+
theme(text = element_text(size =10), plot.title = element_text(size=10))
if(variables == "phenotype"){
p <- p + scale_color_manual(values=sgColorPalette)
}
#sum_pcoA_DesEq<-summary(ps_pcoa_ord)
erie_bray_sum_pcoA <- phyloseq::distance(ps, method = "bray")
sampledf <- data.frame(sample_data(ps))
beta_di<-betadisper(erie_bray_sum_pcoA, ps@sam_data$Family.group.ID)
to_return<-list()
to_return[[1]]<-p
to_return[[2]]<-beta_di
return(to_return)
}
#With Deseq
DeSeq_distance <- plotting_consPcoA(ps_deseq, "Deseq", variables = "phenotype")
# plot
DeSeq_distance[[1]]
#same with CSS
CSS_distance <- plotting_consPcoA(ps_css, "CSS", variables = "phenotype")
# plot
CSS_distance[[1]]
#With Deseq
DeSeq_distance <- plotting_consPcoA(ps_deseq, "Deseq", variables = "Family.group.ID")
# plot
DeSeq_distance[[1]]
#same with CSS
CSS_distance <- plotting_consPcoA(ps_css, "CSS", variables = "Family.group.ID")
# plot
CSS_distance[[1]]
#the distance in those plot?
#average_distance_to_median
#pdf(file=paste0(output_data, "Figures/Distance_DeSeq_CSS_", Sys.Date(), ".pdf"))
boxplot(DeSeq_distance[[2]]$distances,CSS_distance[[2]]$distances, names=c("DeSeq", "CSS"),
xlab = "Type of Normalization", ylab = "Distance on Component 1 & 2", main ="Intragroup distance for each family")
#dev.off()
permanova_res <- read.csv(paste0(output_data, "PERMANOVA_noLR.csv"))
permanova_res <- permanova_res[2:nrow(permanova_res), ] #First only include 95 samples because optional question
r2_min <- .02
n_sample_min <- 400
variables <- as.character(permanova_res[permanova_res$TotalN > n_sample_min & permanova_res$R2 > r2_min, ]$Variable)
plot_list <- list()
for(variable in variables){
res <- plotting_consPcoA(ps_deseq, "Deseq", variables = variable)
plot_list[[variable]] <- res[[1]]
}
plot_grid(plotlist = plot_list, nrow = 2, ncol = 2)
pdf(paste0(output_data, 'pcoa_top_permanova_variables.pdf'), width = 12, height = 9)
plot_grid(plotlist = plot_list, nrow = 2, ncol = 2)
dev.off()
## png
## 2
Characterize and assess the diversity of each sample, and evaluate the extent of dissimilarity between the cohorts
ER <- estimate_richness(ps_not_norm_comp, measures=c("Observed", "Chao1", "Shannon"))
ER <- cbind(ER, sample_data(ps_not_norm_comp)[row.names(ER), c("phenotype", "Family.group.ID", "Within.study.sampling.date")])
ER <- data.table(ER, keep.rownames = TRUE)
ER_long <- melt(ER, id.vars=c('rn', 'phenotype', "Family.group.ID", "Within.study.sampling.date"))
# plot
ggplot(data=ER_long[variable!='se.chao1'], aes(x=phenotype, y=value, fill=phenotype))+
geom_boxplot(width=0.7, outlier.colour='white')+
geom_jitter(size=1, position=position_jitter(width=0.1))+
xlab('')+ylab('')+
scale_fill_manual(values=sgColorPalette)+
theme_minimal()+facet_wrap(~variable, scales='free')
# run t-test to check significance
ttest=NULL
for(alphad in c('Observed', 'Chao1', 'Shannon')){
ttest_res=t.test(value ~ phenotype, data=ER_long[ER_long$variable==alphad], var.equal=TRUE)
ttest=rbindlist(list(ttest, data.table(alpha_index=alphad, pvalue=ttest_res$p.value)))
}
pander(ttest)
| alpha_index | pvalue |
|---|---|
| Observed | 0.1166 |
| Chao1 | 0.1166 |
| Shannon | 0.3603 |
#Let's do a PcoA #not much differences
GP.ord <- ordinate(ps_deseq, "PCoA", "bray")
p2 = plot_ordination(ps_deseq, GP.ord, type="samples", color="phenotype") +
geom_point( size = 1)+
scale_color_manual(values=sgColorPalette)+
theme_minimal()
p2
# function to plot PCoA without NA points
wo_na_pcoa <- function(ps, pvar, ord_res){
# ord_res: ordinated object
keepid=!is.na(get_variable(ps, pvar)) &
get_variable(ps, pvar)!='NA' &
get_variable(ps, pvar)!=''
tmp_ps <- prune_samples(keepid, ps)
# get subset counts and metadata together
DF <- cbind(ord_res$vectors[row.names(sample_data(tmp_ps)), 1:2], sample_data(tmp_ps)[, pvar])
setnames(DF, pvar, 'testvar')
# get eigenvalues
eig=(ord_res$values$Eigenvalues/sum(ord_res$values$Eigenvalues))[1:2]*100
p <- ggplot(data=DF, aes(x=Axis.1, y=Axis.2, colour=testvar))+
geom_point(size=2)+
ggtitle(pvar)+
xlab(paste0('Axis.1 [', format(eig[1], digits=3), '%]'))+
ylab(paste0('Axis.2 [', format(eig[2], digits=3), '%]'))+
theme_minimal()+
theme(legend.title=element_blank(), legend.position="bottom")
print(p)
}
ord <- ordinate(ps_css, method = "PCoA", distance = "bray")
wo_na_pcoa(ps_css, 'Mobile.Autism.Risk.Assessment.Score', ord)
wo_na_pcoa(ps_deseq, 'Probiotic..consumption.frequency.', GP.ord)
# Anti.infective
wo_na_pcoa(ps_deseq, 'Anti.infective', GP.ord)
# Minimum.time.since.antibiotics
sample_data(ps_deseq)$Minimum.time.since.antibiotics <- as.numeric(sample_data(ps_deseq)$Minimum.time.since.antibiotics)
wo_na_pcoa(ps_deseq, 'Minimum.time.since.antibiotics', GP.ord)
permanova_pcut = 0.05
permanova_cut = 0.02
permanova_res <- read.csv(paste0(output_data, "PERMANOVA_noLR.csv"))
for(pvar in permanova_res[permanova_res$R2>permanova_cut & permanova_res$pvalue<permanova_pcut]$Variable){
wo_na_pcoa(ps_deseq, pvar, GP.ord)
}